# Installing the rblm package to use the Bonhomme Lamadon Manresa algorithm for
# clustering firms :

install.packages("pak", repos = sprintf("https://r-lib.github.io/p/pak/stable/%s/%s/%s", .Platform$pkgType, R.Version()$os, R.Version()$arch))
library(pak);
# install the package for the main branch
pak::pkg_install("tlamadon/rblm")


require("rblm")
require(data.table)
require(dplyr)
require(tictoc)

source("AKMland_functions.R")

########################
# Clustered AKM files: #
########################
# Full panel
d <- build_panel(2002, 2007)

ad <- year2_reduce(d)

write.csv(ad[[1]], paste(clusterrep,"jdataperiod1_akm.csv", sep=""))
write.csv(ad[[2]], paste(clusterrep,"sdataperiod1_akm.csv", sep=""))

d <- build_panel(2008, 2013)

ad <- year2_reduce(d)

write.csv(ad[[1]], paste(clusterrep,"jdataperiod2_akm.csv", sep=""))
write.csv(ad[[2]], paste(clusterrep,"sdataperiod2_akm.csv", sep=""))


d <- build_panel(2014, 2019)

ad <- year2_reduce(d)

write.csv(ad[[1]], paste(clusterrep,"jdataperiod3_akm.csv", sep=""))
write.csv(ad[[2]], paste(clusterrep,"sdataperiod3_akm.csv", sep=""))

rm(d)
rm(ad)
gc()




#######################################################################################################
# Clustering firms - Cluster AKM - The final output is a dataframe with a column for the siren 
#######################################################################################################

sdata <- read.csv(paste(clusterrep,"sdataperiod1_akm.csv", sep=""))[,2:6] # stayers
jdata <- read.csv(paste(clusterrep,"jdataperiod1_akm.csv", sep=""))[,2:6] 
setDT(jdata) # define the object as a database
setDT(sdata)# movers
ad = list(jdata=jdata, sdata=sdata)

# Here are the line for the clustering, k=5000 is the number we decide ex ante.
# everything else is as in the original RBLM code. We chose 5000 because it was around
# 1% of the total number of firms
# For simulated data, 10 clusters for faster computation

ms = grouping.getMeasures(ad,"ecdf",Nw=20,y_var = "y1")
tic("groupings")
grps=grouping.classify.once(ms,k=10,nstart=100,iter.max=200,step=10)
toc()
ad=grouping.append(ad,grps$best_cluster,drop=T)

clusters<-as.data.frame(grps$best_cluster)
write.table(clusters, paste(clusterrep,"estimatedclusters_period1.csv", sep=""))

sdata <- read.csv(paste(clusterrep,"sdataperiod2_akm.csv", sep=""))[,2:6] # stayers
jdata <- read.csv(paste(clusterrep,"jdataperiod2_akm.csv", sep=""))[,2:6] 
setDT(jdata) 
setDT(sdata)
ad = list(jdata=jdata, sdata=sdata)

ms = grouping.getMeasures(ad,"ecdf",Nw=20,y_var = "y1")
tic("groupings")
grps=grouping.classify.once(ms,k=10,nstart=100,iter.max=200,step=10)
toc()
ad=grouping.append(ad,grps$best_cluster,drop=T)

clusters<-as.data.frame(grps$best_cluster)
write.table(clusters, paste(clusterrep,"estimatedclusters_period2.csv", sep=""))

sdata <- read.csv(paste(clusterrep,"sdataperiod3_akm.csv", sep=""))[,2:6] # stayers
jdata <- read.csv(paste(clusterrep,"jdataperiod3_akm.csv", sep=""))[,2:6]
setDT(jdata)
setDT(sdata)
ad = list(jdata=jdata, sdata=sdata)

ms = grouping.getMeasures(ad,"ecdf",Nw=20,y_var = "y1")
tic("groupings")
grps=grouping.classify.once(ms,k=10,nstart=100,iter.max=200,step=10)
toc()
ad=grouping.append(ad,grps$best_cluster,drop=T)

clusters<-as.data.frame(grps$best_cluster)
write.table(clusters, paste(clusterrep,"estimatedclusters_period3.csv", sep=""))
